Podsumowanie
1. Wyliczanie bibliotek
# Install a package if it is missing, then attach it.
#
# p: package name as a string.
# Returns TRUE/FALSE (require()'s logical result) so callers such as the
# sapply/vapply loop below can report the load status of each package.
usePackage <- function(p) {
# Only install when the package is not already present locally.
if (!is.element(p, installed.packages()[,1]))
# FIX: spell out `dependencies` — `dep = TRUE` relied on partial
# argument matching, which lintr flags and which breaks if the
# signature ever gains another `dep*` argument.
install.packages(p, dependencies = TRUE, verbose = FALSE)
# require() (unlike library()) returns a logical instead of erroring,
# which is what the status table printed below depends on.
require(p, character.only = TRUE, quietly = TRUE)
}
# Packages used throughout this report.
packageList <- c("dplyr", "matrixStats", "tidyr", "caret", "ggcorrplot", "plotly")
print("Użyte biblioteki to:")
## [1] "Uzyte biblioteki to:"
packageList
## [1] "dplyr" "matrixStats" "tidyr" "caret" "ggcorrplot"
## [6] "plotly"
# FIX: vapply() instead of sapply() — it is type-stable and always returns
# a named logical vector (sapply() can silently change its return type on
# unusual input); the printed result below is unchanged.
vapply(packageList, usePackage, logical(1))
## dplyr matrixStats tidyr caret ggcorrplot plotly
## TRUE TRUE TRUE TRUE TRUE TRUE
2. Zapewnianie powtarzalności obliczeń
# Pin the RNG seed so every stochastic step below (data partition,
# random forest) is reproducible across knits.
setConsistency <- function() set.seed(42)
3. Ładowanie danych
# Download a data file and parse it as CSV.
#
# urlPath:   URL of the remote file.
# localFile: local path the download is written to.
# Returns a data.frame parsed from the downloaded file.
loadData <- function(urlPath, localFile = 'life_expectancy.tsv') {
# download.file() returns a status code, not data — no need to keep it.
download.file(urlPath, destfile = localFile, quiet = TRUE)
# BUG FIX: read the file we just downloaded. The original read the
# global TMP_LOCAL_FILE, silently ignoring the localFile parameter, so
# any caller passing a different destination got stale or missing data.
return(read.csv(localFile, quote = '"'))
}
# Remote source of the WHO Life Expectancy data set.
FILE_URL <- 'http://www.cs.put.poznan.pl/alabijak/emd/projekt/Life_Expectancy_Data.csv'
# Local path the CSV is cached under after download.
TMP_LOCAL_FILE <- 'life_expectancy.csv'
# Raw data frame that all later sections derive from.
# NOTE(review): loadData() as written reads the global TMP_LOCAL_FILE rather
# than the argument passed here — keep the two in sync (or fix loadData).
data <- loadData(FILE_URL, TMP_LOCAL_FILE)
4. Przetwarzanie brakujących danych
# Normalise the raw column names and patch known gaps in the 2015 records.
#
# df: raw data frame as loaded from the CSV.
# Returns the frame with consistent Title.Case column names and with the
# 2015 NA values of Alcohol and Total.Expenditure replaced by 0.
fixData <- function(df) {
# Harmonise the inconsistent source column names.
out <- rename(df,
              Life.Expectancy = Life.expectancy,
              Infant.Deaths = infant.deaths,
              Percentage.Expenditure = percentage.expenditure,
              Under.Five.Deaths = under.five.deaths,
              Total.Expenditure = Total.expenditure,
              Thinness.Years.10.19 = thinness..1.19.years,
              Thinness.Years.5.9 = thinness.5.9.years,
              Income.Resources.Composition = Income.composition.of.resources)
# The 2015 rows never report these two measures; treat missing as zero
# so the year is not dropped wholesale by the later na.omit() step.
patch <- is.na(out$Alcohol) & out$Year == 2015
out$Alcohol[patch] <- 0
patch <- is.na(out$Total.Expenditure) & out$Year == 2015
out$Total.Expenditure[patch] <- 0
return(out)
}
# Drop every row that still contains at least one NA (complete cases only).
cleanData <- function(df) na.omit(df)
# Keep only the non-character columns of a data frame (so cor() and the
# caret models below receive purely numeric/factor input).
#
# df: any data frame.
# Returns df restricted to columns for which is.character() is FALSE.
removeChrColumns <- function(df) {
# FIX: dependency-free base-R implementation. The original called
# library(dplyr) inside the function (a hidden side effect) and used the
# superseded select_if(); vapply() over the columns is type-stable and
# needs no attached packages.
keep <- !vapply(df, is.character, logical(1))
df[, keep, drop = FALSE]
}
# Build the three working data sets used by the later sections:
# fixedData   — renamed columns + 2015 NA patches (may still contain NAs),
fixedData <- fixData(data)
# cleanedData — complete cases only,
cleanedData <- cleanData(fixedData)
# numericData — cleanedData without character columns (input to cor()/caret).
numericData <- removeChrColumns(cleanedData)
5. Rozmiar zbioru i podstawowe statystyki
# Report the basic size of the cleaned data set
# (rows = examples, columns = attributes).
print(paste('Zbiór danych zawiera', nrow(cleanedData), 'przykladów.'))
## [1] "Zbiór danych zawiera 1777 przykladów."
print(paste('Każdy z przykladow posiada', ncol(cleanedData), 'atrybutów.'))
## [1] "Kazdy z przykladow posiada 22 atrybutów."
library(dplyr)
library(tidyr)
# Summary statistics (mean/median/min/max) for four key attributes,
# reshaped so each statistic is a row and each attribute a column.
# NOTE(review): gather()/spread()/separate() are superseded by
# pivot_longer()/pivot_wider()/separate_wider_delim(); they are kept here
# because spread()'s alphabetical row/column ordering is exactly what the
# rendered table below records — a pivot_* rewrite would reorder the output.
cleanedData %>%
tibble::as_tibble() %>%
select(Life.Expectancy, Adult.Mortality, Infant.Deaths, Population) %>%
summarise(across(everything(), list(mean = mean, median = median, min = min, max = max), .names = "{.col}_{.fn}")) %>%
gather(variable, value) %>%
separate(variable, c("var", "funkcja"), sep = "\\_") %>%
spread(var, value) %>%
relocate(Life.Expectancy, .after = funkcja)
## # A tibble: 4 x 5
## funkcja Life.Expectancy Adult.Mortality Infant.Deaths Population
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 max 89 723 1600 1293859294
## 2 mean 69.4 168. 32.2 14430006.
## 3 median 71.7 148 3 1435568
## 4 min 44 1 0 34
6. Szczegółowa analiza wartości atrybutów
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
7. Korelacje między zmiennymi
library(ggcorrplot)
# Pairwise Pearson correlations, rounded to one decimal for a readable map.
corr <- round(cor(numericData), 1)
# Correlation-test p-values; ggcorrplot crosses out insignificant cells.
p.mat <- cor_pmat(numericData)
# Full correlation heat map, hierarchically clustered for visual grouping.
ggcorrplot(corr,
           p.mat = p.mat,
           hc.order = TRUE,
           type = "full",
           tl.cex = 10,
           tl.srt = 60)

8. Długość życia w zależności od krajów
library(plotly)
# Country names kept as a top-level value (recorded for later reference).
countryNames <- unique(cleanedData$Country)
# Mean life expectancy per country, highest first.
# FIX: arrange() is folded into the pipe with a bare column name — the
# original called arrange(countryData, desc(countryData$...)), and using
# `$` on the frame inside a dplyr verb defeats data masking and silently
# misbehaves on grouped data.
countryData <- cleanedData %>%
filter(Life.Expectancy != 0) %>%
group_by(Country) %>%
dplyr::summarize(Mean.Life.Expectancy = mean(Life.Expectancy), .groups = 'drop') %>%
arrange(desc(Mean.Life.Expectancy))
# Bar chart of the per-country means (interactive via plotly).
fig <- plot_ly(countryData, x = ~Country, y = ~Mean.Life.Expectancy, type = 'bar')
fig
9. Regresor
library(caret)
# Reset the seed so the random partition below is reproducible.
setConsistency()
# Stratified 75/25 train/test split on the regression target.
# FIX: removed the stray trailing comma in the call, which passed an
# empty (missing) positional argument to createDataPartition().
inTraining <-
createDataPartition(
y = numericData$Life.Expectancy,
p = .75,
list = FALSE)
trainingSet <- numericData[inTraining,]
testingSet <- numericData[-inTraining,]
# Tune over every mtry from 2 up to the number of predictors.
rfGrid <- expand.grid(mtry = seq(2, ncol(numericData) - 1))
# 2-fold cross-validation repeated 5 times.
gridCtrl <- trainControl(
method = "repeatedcv",
number = 2,
repeats = 5)
# Tune a random forest regressor over the mtry grid, selecting the model
# with the lowest cross-validated RMSE. Inputs are centred and scaled;
# ntree is kept deliberately small so the report knits quickly.
fitTune <- train(
  Life.Expectancy ~ .,
  data      = trainingSet,
  method    = "rf",
  metric    = "RMSE",
  preProc   = c("center", "scale"),
  trControl = gridCtrl,
  tuneGrid  = rfGrid,
  ntree     = 10)
fitTune
## Random Forest
##
## 1334 samples
## 19 predictor
##
## Pre-processing: centered (19), scaled (19)
## Resampling: Cross-Validated (2 fold, repeated 5 times)
## Summary of sample sizes: 666, 668, 667, 667, 667, 667, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 2.865870 0.8981423 2.008806
## 3 2.611478 0.9138562 1.823620
## 4 2.496997 0.9209252 1.729448
## 5 2.414434 0.9260085 1.651294
## 6 2.334129 0.9300500 1.604995
## 7 2.349975 0.9291291 1.599798
## 8 2.281503 0.9329390 1.568389
## 9 2.303874 0.9315994 1.572533
## 10 2.300910 0.9315208 1.563833
## 11 2.288248 0.9320849 1.549882
## 12 2.274257 0.9330773 1.553570
## 13 2.288128 0.9322114 1.554789
## 14 2.282676 0.9322473 1.545455
## 15 2.297981 0.9313793 1.561153
## 16 2.286173 0.9322201 1.551356
## 17 2.328094 0.9294172 1.578184
## 18 2.300977 0.9314149 1.557778
## 19 2.324170 0.9297220 1.572661
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 12.
# RMSE across the mtry grid, in a plain black-and-white theme.
ggplot(fitTune) +
  theme_bw()

# Evaluate the tuned forest on the held-out 25% test set.
rfTuneClasses <- predict(fitTune, newdata = testingSet)
# Test-set RMSE / R^2 / MAE for the selected model.
postResample(as.numeric(rfTuneClasses), testingSet$Life.Expectancy)
## RMSE Rsquared MAE
## 1.9878760 0.9487156 1.2810903